﻿using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace TestProject
{
    /// <summary>
    /// Counts how often each fixed-length run of Chinese characters occurs in a text
    /// and prints the most frequent runs to the console.
    /// </summary>
    class Novel
    {
        /// <summary>
        /// Reads the hard-coded input file, keeps only characters in U+4E00..U+9FA5,
        /// counts every substring of <c>wordLength</c> characters, prints those that occur
        /// more than <c>minOccurrence</c> times (most frequent first), and finally prints
        /// the total number of distinct substrings found.
        /// </summary>
        public void ReadTest()
        {
            // File size: 2,891,224; total character count: 979,116
            // (figures noted for this particular input file).
            string input = @"d:\data\yttlj.txt";

            const int wordLength = 9;     // length, in characters, of each substring that is counted
            const int minOccurrence = 3;  // only substrings seen MORE often than this are printed

            string alltext = File.ReadAllText(input);
            alltext = FormatText(alltext);

            // analyze() returns the items already sorted by descending occurrence,
            // and Where() preserves that order, so no second sort is needed here.
            List<TextItem> items = analyze(alltext, wordLength);
            foreach (var item in items.Where(candidate => candidate.Occurence > minOccurrence))
            {
                Console.WriteLine(item);
            }
            Console.WriteLine(items.Count);
        }

        /// <summary>
        /// Returns <paramref name="text"/> with every character outside the range
        /// U+4E00..U+9FA5 removed (punctuation, whitespace, Latin letters, digits, ...).
        /// </summary>
        /// <param name="text">The raw text to filter.</param>
        /// <returns>A string containing only the characters that fall inside the range.</returns>
        private string FormatText(string text)
        {
            StringBuilder sb = new StringBuilder(text.Length);
            foreach (char c in text)
            {
                if (c >= '\u4e00' && c <= '\u9fa5')
                {
                    sb.Append(c);
                }
            }
            return sb.ToString();
        }

        /// <summary>
        /// Counts every substring of length <paramref name="keylen"/> in <paramref name="text"/>
        /// using a sliding window that advances one character at a time.
        /// </summary>
        /// <param name="text">The text to scan.</param>
        /// <param name="keylen">The substring length; must be positive.</param>
        /// <returns>
        /// One <see cref="TextItem"/> per distinct substring, sorted by descending occurrence.
        /// The list is empty when <paramref name="text"/> is shorter than <paramref name="keylen"/>.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="keylen"/> is zero or negative.</exception>
        private List<TextItem> analyze(string text, int keylen)
        {
            if (text == null)
            {
                throw new ArgumentNullException("text");
            }
            if (keylen <= 0)
            {
                throw new ArgumentOutOfRangeException("keylen", keylen, "The key length must be positive.");
            }

            Dictionary<string, int> dic = new Dictionary<string, int>();

            // "<=" so that the final window (ending on the last character) is counted too.
            for (int i = 0; i <= text.Length - keylen; i++)
            {
                string key = text.Substring(i, keylen);

                // TryGetValue leaves count at 0 for an unseen key, so one code path handles both cases.
                int count;
                dic.TryGetValue(key, out count);
                dic[key] = count + 1;
            }

            // OrderByDescending is a deferred query that does not sort in place;
            // its result has to be materialized and returned.
            return dic
                .Select(pair => new TextItem(pair.Key, pair.Value))
                .OrderByDescending(item => item.Occurence)
                .ToList();
        }

        /// <summary>
        /// A substring together with the number of times it was seen.
        /// </summary>
        class TextItem
        {
            /// <summary>The substring that was counted.</summary>
            public readonly string Word;

            /// <summary>How many times <see cref="Word"/> occurred.</summary>
            public readonly int Occurence;

            public TextItem(string word, int occ)
            {
                Word = word;
                Occurence = occ;
            }

            /// <summary>Formats the item as "word&lt;TAB&gt;count".</summary>
            public override string ToString()
            {
                return Word + "\t" + Occurence;
            }
        }
    }
}
